Data Preparation and EDA

library(dplyr)
library(MASS)
library(ggplot2)
library(GGally)
library(leaps)
library(tidyverse)
library(performance)
library(patchwork)
library(interactions)
library(modelr)
library(caret)

crime_df = 
  read_csv("cdi.csv") %>% 
  mutate(
    region = factor(region),
    pop_density = pop / area,
    beds_density_1000 = beds / pop * 1000,
    docs_density = docs / pop,
    crm_1000 = 1000 * crimes / pop
  ) %>% 
  dplyr::select(-crimes, -id,-area, -pop, -beds, -docs) %>% 
  dplyr::select(crm_1000, everything()) %>%
  drop_na()

Independent Variable Selection

First examine the marginal distributions of each variable.

#Load data
crime_df1 = 
  read_csv("cdi.csv") %>% 
  mutate(
    region = factor(region),
    pop_density = pop / area,
    crm_1000 = 1000 * crimes / pop
  ) %>%
  dplyr::select(-pop, -id, -cty, -state, -area, -crimes)

To begin with, we take a look on the distribution of crime rate per 1000 population. We find that Kings County in NY is an extreme outlier with close to 300 crime rate, so we may not include this data in the following analysis.

First examine the marginal distributions of each variable.

# graph
compute_plist = function(x) {
  if (is.factor(x)) {
    pl = crime_df1 %>% 
    ggplot(aes(x = x, y = crm_1000)) +
    geom_boxplot() +
    labs(
      x = '',
      y = "Crime Rate",
    )
  } else {
    pl = crime_df1 %>% 
    ggplot(aes(x = x, y = crm_1000)) +
    geom_point() +
    geom_smooth(method = "lm") +
    labs(
      x = '',
      y = "Crime Rate",
    )
  }
  return(pl)
}

plist = crime_df1 %>%
  dplyr::select(-crm_1000) %>%
  map(compute_plist)

labels = crime_df1 %>%
  dplyr::select(-crm_1000) %>%
  names()

for (i in 1:length(plist)) {
  plist[[i]] = plist[[i]] + labs(x = labels[[i]])
}

wrap_plots(plist, ncol = 4)

Compare p-value of all variables

compute_p_value = function(x) {
  lm_data = lm(crm_1000 ~ x, data = crime_df1)
  if (is.factor(x)) {
    return( lm_data %>% broom::tidy() %>% dplyr::select(p.value) %>% .[2:4,1])
  }
  return(lm_data %>% broom::tidy() %>% dplyr::select(p.value) %>% .[2,1])
}

labels_list <- c("pop18","pop65","docs","beds","hsgrad","bagrad","poverty","unemp","pcincome","totalinc","region_1","region_2", "region_3", "pop_density")

p_values = crime_df1 %>%
  dplyr::select(-crm_1000) %>%
  map_df(compute_p_value) %>%
  mutate(Variables = labels_list) 

p_values
## # A tibble: 14 × 2
##     p.value Variables  
##       <dbl> <chr>      
##  1 5.75e- 5 pop18      
##  2 1.64e- 1 pop65      
##  3 4.31e-11 docs       
##  4 1.44e-17 beds       
##  5 1.60e- 6 hsgrad     
##  6 4.23e- 1 bagrad     
##  7 8.92e-26 poverty    
##  8 3.81e- 1 unemp      
##  9 9.27e- 2 pcincome   
## 10 1.32e- 6 totalinc   
## 11 4.06e- 3 region_1   
## 12 6.09e-19 region_2   
## 13 2.33e- 7 region_3   
## 14 8.66e-27 pop_density

Backward & Forward & Stepwise

crime_df2 = 
  read_csv("cdi.csv") %>% 
  mutate(
    region = factor(region),
    pop_density = pop / area,
    crm_1000 = 1000 * crimes / pop,
    beds_density = beds / pop,
    docs_density = docs / pop
  ) %>% 
  dplyr::select(-crimes, -id, -area, -pop, -cty, -state, -beds, -docs, -region) %>%
  dplyr::select(crm_1000, everything())
## Rows: 440 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): cty, state
## dbl (15): id, area, pop, pop18, pop65, docs, beds, crimes, hsgrad, bagrad, p...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# fit regression using all predictors
mult.fit = lm(crm_1000 ~ ., data = crime_df2)

step(mult.fit, direction = 'backward')
## Start:  AIC=2659.33
## crm_1000 ~ pop18 + pop65 + hsgrad + bagrad + poverty + unemp + 
##     pcincome + totalinc + pop_density + beds_density + docs_density
## 
##                Df Sum of Sq    RSS    AIC
## - bagrad        1       0.3 175634 2657.3
## - docs_density  1       5.4 175640 2657.3
## - pcincome      1      98.1 175732 2657.6
## - pop65         1     175.9 175810 2657.8
## - pop18         1     350.4 175985 2658.2
## - hsgrad        1     604.4 176239 2658.8
## <none>                      175634 2659.3
## - unemp         1    1376.7 177011 2660.8
## - beds_density  1    1776.4 177411 2661.8
## - totalinc      1    3163.2 178797 2665.2
## - poverty       1   20581.5 196216 2706.1
## - pop_density   1   30494.9 206129 2727.8
## 
## Step:  AIC=2657.33
## crm_1000 ~ pop18 + pop65 + hsgrad + poverty + unemp + pcincome + 
##     totalinc + pop_density + beds_density + docs_density
## 
##                Df Sum of Sq    RSS    AIC
## - docs_density  1       5.1 175640 2655.3
## - pop65         1     175.6 175810 2655.8
## - pcincome      1     195.1 175830 2655.8
## - pop18         1     470.2 176105 2656.5
## <none>                      175634 2657.3
## - hsgrad        1     868.0 176502 2657.5
## - unemp         1    1499.7 177134 2659.1
## - beds_density  1    1932.7 177567 2660.1
## - totalinc      1    3188.6 178823 2663.2
## - poverty       1   26123.4 201758 2716.3
## - pop_density   1   30577.1 206212 2725.9
## 
## Step:  AIC=2655.34
## crm_1000 ~ pop18 + pop65 + hsgrad + poverty + unemp + pcincome + 
##     totalinc + pop_density + beds_density
## 
##                Df Sum of Sq    RSS    AIC
## - pop65         1     175.5 175815 2653.8
## - pcincome      1     204.3 175844 2653.8
## - pop18         1     472.0 176112 2654.5
## <none>                      175640 2655.3
## - hsgrad        1     864.2 176504 2655.5
## - unemp         1    1495.7 177135 2657.1
## - totalinc      1    3183.5 178823 2661.2
## - beds_density  1    3218.9 178859 2661.3
## - poverty       1   26501.7 202141 2715.2
## - pop_density   1   30736.7 206376 2724.3
## 
## Step:  AIC=2653.78
## crm_1000 ~ pop18 + hsgrad + poverty + unemp + pcincome + totalinc + 
##     pop_density + beds_density
## 
##                Df Sum of Sq    RSS    AIC
## - pcincome      1     227.1 176042 2652.3
## <none>                      175815 2653.8
## - hsgrad        1     992.1 176807 2654.3
## - pop18         1    1252.2 177067 2654.9
## - unemp         1    1656.9 177472 2655.9
## - beds_density  1    3119.9 178935 2659.5
## - totalinc      1    3153.5 178969 2659.6
## - poverty       1   29316.3 205131 2719.6
## - pop_density   1   30562.1 206377 2722.3
## 
## Step:  AIC=2652.35
## crm_1000 ~ pop18 + hsgrad + poverty + unemp + totalinc + pop_density + 
##     beds_density
## 
##                Df Sum of Sq    RSS    AIC
## <none>                      176042 2652.3
## - pop18         1      1110 177153 2653.1
## - hsgrad        1      1331 177373 2653.7
## - unemp         1      1609 177651 2654.3
## - beds_density  1      3580 179622 2659.2
## - totalinc      1      4257 180299 2660.9
## - poverty       1     33494 209537 2727.0
## - pop_density   1     35476 211518 2731.1
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + hsgrad + poverty + unemp + totalinc + 
##     pop_density + beds_density, data = crime_df2)
## 
## Coefficients:
##  (Intercept)         pop18        hsgrad       poverty         unemp  
##   -1.618e+01     4.235e-01     4.017e-01     2.912e+00    -1.079e+00  
##     totalinc   pop_density  beds_density  
##    2.560e-04     4.477e-03     1.629e+03
step(mult.fit, direction = 'forward')
## Start:  AIC=2659.33
## crm_1000 ~ pop18 + pop65 + hsgrad + bagrad + poverty + unemp + 
##     pcincome + totalinc + pop_density + beds_density + docs_density
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + pop65 + hsgrad + bagrad + poverty + 
##     unemp + pcincome + totalinc + pop_density + beds_density + 
##     docs_density, data = crime_df2)
## 
## Coefficients:
##  (Intercept)         pop18         pop65        hsgrad        bagrad  
##   -1.192e+01     3.427e-01    -2.230e-01     3.368e-01     8.503e-03  
##      poverty         unemp      pcincome      totalinc   pop_density  
##    2.970e+00    -1.048e+00     2.649e-04     2.363e-04     4.391e-03  
## beds_density  docs_density  
##    1.765e+03    -1.299e+02
step(mult.fit, direction = 'both')
## Start:  AIC=2659.33
## crm_1000 ~ pop18 + pop65 + hsgrad + bagrad + poverty + unemp + 
##     pcincome + totalinc + pop_density + beds_density + docs_density
## 
##                Df Sum of Sq    RSS    AIC
## - bagrad        1       0.3 175634 2657.3
## - docs_density  1       5.4 175640 2657.3
## - pcincome      1      98.1 175732 2657.6
## - pop65         1     175.9 175810 2657.8
## - pop18         1     350.4 175985 2658.2
## - hsgrad        1     604.4 176239 2658.8
## <none>                      175634 2659.3
## - unemp         1    1376.7 177011 2660.8
## - beds_density  1    1776.4 177411 2661.8
## - totalinc      1    3163.2 178797 2665.2
## - poverty       1   20581.5 196216 2706.1
## - pop_density   1   30494.9 206129 2727.8
## 
## Step:  AIC=2657.33
## crm_1000 ~ pop18 + pop65 + hsgrad + poverty + unemp + pcincome + 
##     totalinc + pop_density + beds_density + docs_density
## 
##                Df Sum of Sq    RSS    AIC
## - docs_density  1       5.1 175640 2655.3
## - pop65         1     175.6 175810 2655.8
## - pcincome      1     195.1 175830 2655.8
## - pop18         1     470.2 176105 2656.5
## <none>                      175634 2657.3
## - hsgrad        1     868.0 176502 2657.5
## - unemp         1    1499.7 177134 2659.1
## + bagrad        1       0.3 175634 2659.3
## - beds_density  1    1932.7 177567 2660.1
## - totalinc      1    3188.6 178823 2663.2
## - poverty       1   26123.4 201758 2716.3
## - pop_density   1   30577.1 206212 2725.9
## 
## Step:  AIC=2655.34
## crm_1000 ~ pop18 + pop65 + hsgrad + poverty + unemp + pcincome + 
##     totalinc + pop_density + beds_density
## 
##                Df Sum of Sq    RSS    AIC
## - pop65         1     175.5 175815 2653.8
## - pcincome      1     204.3 175844 2653.8
## - pop18         1     472.0 176112 2654.5
## <none>                      175640 2655.3
## - hsgrad        1     864.2 176504 2655.5
## - unemp         1    1495.7 177135 2657.1
## + docs_density  1       5.1 175634 2657.3
## + bagrad        1       0.0 175640 2657.3
## - totalinc      1    3183.5 178823 2661.2
## - beds_density  1    3218.9 178859 2661.3
## - poverty       1   26501.7 202141 2715.2
## - pop_density   1   30736.7 206376 2724.3
## 
## Step:  AIC=2653.78
## crm_1000 ~ pop18 + hsgrad + poverty + unemp + pcincome + totalinc + 
##     pop_density + beds_density
## 
##                Df Sum of Sq    RSS    AIC
## - pcincome      1     227.1 176042 2652.3
## <none>                      175815 2653.8
## - hsgrad        1     992.1 176807 2654.3
## - pop18         1    1252.2 177067 2654.9
## + pop65         1     175.5 175640 2655.3
## + docs_density  1       5.0 175810 2655.8
## + bagrad        1       0.5 175815 2655.8
## - unemp         1    1656.9 177472 2655.9
## - beds_density  1    3119.9 178935 2659.5
## - totalinc      1    3153.5 178969 2659.6
## - poverty       1   29316.3 205131 2719.6
## - pop_density   1   30562.1 206377 2722.3
## 
## Step:  AIC=2652.35
## crm_1000 ~ pop18 + hsgrad + poverty + unemp + totalinc + pop_density + 
##     beds_density
## 
##                Df Sum of Sq    RSS    AIC
## <none>                      176042 2652.3
## - pop18         1      1110 177153 2653.1
## - hsgrad        1      1331 177373 2653.7
## + pcincome      1       227 175815 2653.8
## + pop65         1       198 175844 2653.8
## + bagrad        1       114 175929 2654.1
## + docs_density  1        17 176025 2654.3
## - unemp         1      1609 177651 2654.3
## - beds_density  1      3580 179622 2659.2
## - totalinc      1      4257 180299 2660.9
## - poverty       1     33494 209537 2727.0
## - pop_density   1     35476 211518 2731.1
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + hsgrad + poverty + unemp + totalinc + 
##     pop_density + beds_density, data = crime_df2)
## 
## Coefficients:
##  (Intercept)         pop18        hsgrad       poverty         unemp  
##   -1.618e+01     4.235e-01     4.017e-01     2.912e+00    -1.079e+00  
##     totalinc   pop_density  beds_density  
##    2.560e-04     4.477e-03     1.629e+03

Test Based Procedures

mat = as.matrix(crime_df2)
# Printing the 2 best models of each size, using the Cp criterion:
leaps(x = mat[,2:12], y = mat[,1], nbest = 2, method = "Cp")
## $which
##        1     2     3     4     5     6     7     8     9     A     B
## 1  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## 1  FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 2  FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
## 2  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE
## 3  FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE
## 3  FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
## 4  FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE
## 4  FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE
## 5  FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## 5  FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE
## 6  FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## 6   TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## 7   TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## 7  FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## 8   TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## 8   TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## 9   TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## 9   TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## 10  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 10  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## 11  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 
## $label
##  [1] "(Intercept)" "1"           "2"           "3"           "4"          
##  [6] "5"           "6"           "7"           "8"           "9"          
## [11] "A"           "B"          
## 
## $size
##  [1]  2  2  3  3  4  4  5  5  6  6  7  7  8  8  9  9 10 10 11 11 12
## 
## $Cp
##  [1] 178.523611 185.054464  43.329279 121.797655  22.962369  24.812950
##  [7]  14.542755  15.594180   9.242835   9.247756   5.700027   6.237706
## [13]   4.994133   5.444228   6.440778   6.510977   8.013131   8.242887
## [19]  10.000678  10.013108  12.000000
# Printing the 2 best models of each size, using the adjusted R^2 criterion:
leaps(x = mat[,2:12], y = mat[,1], nbest = 2, method = "adjr2")
## $which
##        1     2     3     4     5     6     7     8     9     A     B
## 1  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## 1  FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## 2  FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
## 2  FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE
## 3  FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE
## 3  FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
## 4  FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE
## 4  FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE
## 5  FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## 5  FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE
## 6  FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## 6   TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## 7   TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## 7  FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## 8   TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## 8   TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## 9   TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## 9   TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## 10  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 10  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## 11  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 
## $label
##  [1] "(Intercept)" "1"           "2"           "3"           "4"          
##  [6] "5"           "6"           "7"           "8"           "9"          
## [11] "A"           "B"          
## 
## $size
##  [1]  2  2  3  3  4  4  5  5  6  6  7  7  8  8  9  9 10 10 11 11 12
## 
## $adjr2
##  [1] 0.2290554 0.2208622 0.3998009 0.3011339 0.4266132 0.4242809 0.4384570
##  [8] 0.4371289 0.4464056 0.4463994 0.4521611 0.4514787 0.4543347 0.4537622
## [15] 0.4537742 0.4536847 0.4530503 0.4527567 0.4517914 0.4517754 0.4505114
# Function regsubsets() performs a subset selection by identifying the "best" model that contains
# a certain number of predictors. By default "best" is chosen using SSE/RSS (smaller is better)
b = regsubsets(crm_1000 ~ ., data = crime_df2)
rs = summary(b)

# plot of Cp and Adj-R2 as functions of parameters
par(mfrow=c(1,2))

plot(2:9, rs$cp, xlab="No of parameters", ylab="Cp Statistic")
abline(0,1)
#6
plot(2:9, rs$adjr2, xlab="No of parameters", ylab="Adj R2")

#6
Model_backward =
lm(formula = crm_1000 ~ pop18 + hsgrad + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000, data = crime_df)
summary(Model_backward)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + hsgrad + bagrad + poverty + pcincome + 
##     totalinc + region + pop_density + beds_density_1000, data = crime_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -50.29 -11.31  -0.85  10.43  76.83 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6.658e+01  2.462e+01  -2.704 0.007114 ** 
## pop18              8.036e-01  2.861e-01   2.808 0.005208 ** 
## hsgrad             5.786e-01  2.641e-01   2.191 0.028998 *  
## bagrad            -6.045e-01  2.831e-01  -2.135 0.033329 *  
## poverty            2.061e+00  3.618e-01   5.696 2.28e-08 ***
## pcincome           1.102e-03  4.714e-04   2.338 0.019866 *  
## totalinc           1.996e-04  7.622e-05   2.619 0.009136 ** 
## region2            9.145e+00  2.668e+00   3.428 0.000668 ***
## region3            2.710e+01  2.533e+00  10.699  < 2e-16 ***
## region4            2.127e+01  3.080e+00   6.906 1.81e-11 ***
## pop_density        4.941e-03  4.494e-04  10.995  < 2e-16 ***
## beds_density_1000  2.484e+00  5.042e-01   4.926 1.20e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.88 on 428 degrees of freedom
## Multiple R-squared:  0.5827, Adjusted R-squared:  0.572 
## F-statistic: 54.33 on 11 and 428 DF,  p-value: < 2.2e-16
BIC(Model_backward)
## [1] 3853.23
Model_forward =
  lm(formula = crm_1000 ~ pop18 + pop65 + hsgrad + bagrad + poverty + unemp + pcincome + totalinc + region + pop_density + beds_density_1000 + docs_density, data = crime_df)
BIC(Model_forward)
## [1] 3869.53
summary(Model_forward)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + pop65 + hsgrad + bagrad + poverty + 
##     unemp + pcincome + totalinc + region + pop_density + beds_density_1000 + 
##     docs_density, data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.570 -11.549  -0.911  10.417  75.670 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6.862e+01  2.757e+01  -2.489  0.01319 *  
## pop18              7.160e-01  3.327e-01   2.152  0.03198 *  
## pop65             -1.993e-01  3.072e-01  -0.649  0.51695    
## hsgrad             6.074e-01  2.706e-01   2.245  0.02529 *  
## bagrad            -4.964e-01  2.988e-01  -1.662  0.09733 .  
## poverty            1.879e+00  3.887e-01   4.836 1.86e-06 ***
## unemp              6.015e-01  5.345e-01   1.125  0.26109    
## pcincome           1.029e-03  4.846e-04   2.123  0.03432 *  
## totalinc           2.072e-04  7.652e-05   2.707  0.00705 ** 
## region2            9.098e+00  2.747e+00   3.312  0.00101 ** 
## region3            2.785e+01  2.673e+00  10.419  < 2e-16 ***
## region4            2.160e+01  3.140e+00   6.879 2.16e-11 ***
## pop_density        5.006e-03  4.536e-04  11.037  < 2e-16 ***
## beds_density_1000  3.138e+00  7.988e-01   3.928  0.00010 ***
## docs_density      -6.554e+02  1.025e+03  -0.639  0.52298    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.9 on 425 degrees of freedom
## Multiple R-squared:  0.5845, Adjusted R-squared:  0.5708 
## F-statistic: 42.71 on 14 and 425 DF,  p-value: < 2.2e-16
Model_stepwise = 
  lm(formula = crm_1000 ~ pop18 + hsgrad + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000, data = crime_df)
summary(Model_stepwise)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + hsgrad + bagrad + poverty + pcincome + 
##     totalinc + region + pop_density + beds_density_1000, data = crime_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -50.29 -11.31  -0.85  10.43  76.83 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6.658e+01  2.462e+01  -2.704 0.007114 ** 
## pop18              8.036e-01  2.861e-01   2.808 0.005208 ** 
## hsgrad             5.786e-01  2.641e-01   2.191 0.028998 *  
## bagrad            -6.045e-01  2.831e-01  -2.135 0.033329 *  
## poverty            2.061e+00  3.618e-01   5.696 2.28e-08 ***
## pcincome           1.102e-03  4.714e-04   2.338 0.019866 *  
## totalinc           1.996e-04  7.622e-05   2.619 0.009136 ** 
## region2            9.145e+00  2.668e+00   3.428 0.000668 ***
## region3            2.710e+01  2.533e+00  10.699  < 2e-16 ***
## region4            2.127e+01  3.080e+00   6.906 1.81e-11 ***
## pop_density        4.941e-03  4.494e-04  10.995  < 2e-16 ***
## beds_density_1000  2.484e+00  5.042e-01   4.926 1.20e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.88 on 428 degrees of freedom
## Multiple R-squared:  0.5827, Adjusted R-squared:  0.572 
## F-statistic: 54.33 on 11 and 428 DF,  p-value: < 2.2e-16
BIC(Model_stepwise)
## [1] 3853.23
cp1 = 
  lm(formula = crm_1000 ~ pop18 + hsgrad + poverty + unemp + totalinc + pop_density + docs_density, data = crime_df2)
BIC(cp1)
## [1] 3943.759
summary(cp1)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + hsgrad + poverty + unemp + totalinc + 
##     pop_density + docs_density, data = crime_df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.162 -12.313  -1.754  13.102  69.026 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -3.927e+00  2.003e+01  -0.196  0.84467    
## pop18         3.158e-01  2.575e-01   1.227  0.22066    
## hsgrad        3.165e-01  2.253e-01   1.405  0.16083    
## poverty       3.097e+00  3.099e-01   9.993  < 2e-16 ***
## unemp        -1.280e+00  5.357e-01  -2.390  0.01726 *  
## totalinc      2.283e-04  8.001e-05   2.853  0.00454 ** 
## pop_density   4.437e-03  4.928e-04   9.005  < 2e-16 ***
## docs_density  1.577e+03  7.173e+02   2.198  0.02844 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.28 on 432 degrees of freedom
## Multiple R-squared:  0.4582, Adjusted R-squared:  0.4494 
## F-statistic: 52.19 on 7 and 432 DF,  p-value: < 2.2e-16
cp2 = 
  lm(formula = crm_1000 ~ pop65 + hsgrad + poverty + unemp + totalinc + pop_density + docs_density, data = crime_df2)
BIC(cp2)
## [1] 3944.684
summary(cp2)
## 
## Call:
## lm(formula = crm_1000 ~ pop65 + hsgrad + poverty + unemp + totalinc + 
##     pop_density + docs_density, data = crime_df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.172 -12.499  -1.668  12.860  69.984 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.666e+00  2.177e+01   0.260  0.79474    
## pop65        -2.050e-01  2.660e-01  -0.771  0.44132    
## hsgrad        3.365e-01  2.284e-01   1.473  0.14139    
## poverty       3.132e+00  3.117e-01  10.048  < 2e-16 ***
## unemp        -1.327e+00  5.350e-01  -2.480  0.01353 *  
## totalinc      2.274e-04  8.016e-05   2.837  0.00477 ** 
## pop_density   4.489e-03  4.911e-04   9.140  < 2e-16 ***
## docs_density  1.732e+03  7.253e+02   2.388  0.01737 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.3 on 432 degrees of freedom
## Multiple R-squared:  0.457,  Adjusted R-squared:  0.4482 
## F-statistic: 51.95 on 7 and 432 DF,  p-value: < 2.2e-16

select based on marginal distribution

#linear regression model with full predictors
fit_no_inte = lm(crm_1000 ~ pop18 +totalinc + poverty + hsgrad + beds_density_1000 + docs_density + region + pop_density, data = crime_df)
summary(fit_no_inte)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + poverty + hsgrad + 
##     beds_density_1000 + docs_density + region + pop_density, 
##     data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.536 -11.453  -0.606  10.089  75.406 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -2.471e+01  1.601e+01  -1.543 0.123503    
## pop18              4.176e-01  2.331e-01   1.792 0.073901 .  
## totalinc           2.473e-04  7.250e-05   3.411 0.000708 ***
## poverty            1.607e+00  3.141e-01   5.116 4.71e-07 ***
## hsgrad             3.248e-01  2.025e-01   1.604 0.109452    
## beds_density_1000  2.921e+00  7.242e-01   4.034 6.48e-05 ***
## docs_density      -5.132e+02  9.134e+02  -0.562 0.574470    
## region2            9.001e+00  2.625e+00   3.429 0.000664 ***
## region3            2.586e+01  2.496e+00  10.364  < 2e-16 ***
## region4            2.071e+01  3.074e+00   6.736 5.24e-11 ***
## pop_density        5.145e-03  4.424e-04  11.629  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.97 on 429 degrees of freedom
## Multiple R-squared:  0.5772, Adjusted R-squared:  0.5674 
## F-statistic: 58.57 on 10 and 429 DF,  p-value: < 2.2e-16
fit_no = lm(crm_1000 ~ pop18 +totalinc + poverty + hsgrad + beds_density_1000 + docs_density + region + pop_density + pcincome, data = crime_df)
small = lm(crm_1000 ~ pop18 +totalinc + poverty + hsgrad + beds_density_1000 + docs_density + region + pop_density , data = crime_df)

anova(small, fit_no)
## Analysis of Variance Table
## 
## Model 1: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + 
##     docs_density + region + pop_density
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + 
##     docs_density + region + pop_density + pcincome
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    429 138602                           
## 2    428 137872  1    729.97 2.2661  0.133

Next using partial ANOVA to test whether large model is superior

no pop_density

fit_1 = lm(crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + docs_density + region, data = crime_df)
summary(fit_1)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + poverty + hsgrad + 
##     beds_density_1000 + docs_density + region, data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -73.758 -10.480  -0.863   9.935 217.191 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.430e+01  1.831e+01  -0.781   0.4352    
## pop18              6.297e-01  2.662e-01   2.366   0.0184 *  
## totalinc           4.760e-04  7.994e-05   5.955 5.42e-09 ***
## poverty            1.844e+00  3.590e-01   5.137 4.25e-07 ***
## hsgrad             1.173e-01  2.311e-01   0.508   0.6119    
## beds_density_1000  2.541e+00  8.287e-01   3.067   0.0023 ** 
## docs_density       1.688e+03  1.024e+03   1.649   0.0998 .  
## region2            6.283e+00  2.995e+00   2.098   0.0365 *  
## region3            2.172e+01  2.829e+00   7.676 1.12e-13 ***
## region4            1.583e+01  3.489e+00   4.538 7.38e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.59 on 430 degrees of freedom
## Multiple R-squared:  0.444,  Adjusted R-squared:  0.4323 
## F-statistic: 38.15 on 9 and 430 DF,  p-value: < 2.2e-16
# compare nested (small vs large) models
# Ho: smaller model is defensible
anova(fit_1, fit_no_inte)
## Analysis of Variance Table
## 
## Model 1: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + 
##     docs_density + region
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + 
##     docs_density + region + pop_density
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    430 182292                                  
## 2    429 138602  1     43690 135.23 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Reject H0, large model is superior

no region

fit_2 = lm(crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + docs_density + pop_density, data = crime_df)
summary(fit_2)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + poverty + hsgrad + 
##     beds_density_1000 + docs_density + pop_density, data = crime_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -60.71 -12.33  -2.50  12.46  68.43 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -3.758e+01  1.746e+01  -2.153  0.03190 *  
## pop18              5.140e-01  2.572e-01   1.998  0.04631 *  
## totalinc           2.523e-04  8.098e-05   3.115  0.00196 ** 
## poverty            2.798e+00  3.176e-01   8.809  < 2e-16 ***
## hsgrad             5.517e-01  2.137e-01   2.582  0.01015 *  
## beds_density_1000  1.805e+00  7.544e-01   2.393  0.01713 *  
## docs_density       2.276e+02  9.981e+02   0.228  0.81972    
## pop_density        4.433e-03  4.927e-04   8.997  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.28 on 432 degrees of freedom
## Multiple R-squared:  0.4582, Adjusted R-squared:  0.4494 
## F-statistic: 52.19 on 7 and 432 DF,  p-value: < 2.2e-16
# compare nested (small vs large) models
# Ho: smaller model is defensible
anova(fit_2, fit_no_inte)
## Analysis of Variance Table
## 
## Model 1: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + 
##     docs_density + pop_density
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + 
##     docs_density + region + pop_density
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    432 177630                                  
## 2    429 138602  3     39028 40.266 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Reject H0, large model is superior

no docs_density

fit_3 = lm(crm_1000 ~  pop18 + totalinc + poverty + hsgrad + beds_density_1000 + region + pop_density, data = crime_df)
summary(fit_3)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + poverty + hsgrad + 
##     beds_density_1000 + region + pop_density, data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.426 -11.302  -0.614  10.041  75.789 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -2.218e+01  1.535e+01  -1.445 0.149295    
## pop18              3.911e-01  2.281e-01   1.715 0.087128 .  
## totalinc           2.406e-04  7.146e-05   3.367 0.000827 ***
## poverty            1.624e+00  3.124e-01   5.197 3.13e-07 ***
## hsgrad             3.007e-01  1.977e-01   1.521 0.129104    
## beds_density_1000  2.626e+00  4.976e-01   5.277 2.08e-07 ***
## region2            9.246e+00  2.587e+00   3.574 0.000391 ***
## region3            2.588e+01  2.493e+00  10.378  < 2e-16 ***
## region4            2.056e+01  3.061e+00   6.718 5.85e-11 ***
## pop_density        5.094e-03  4.325e-04  11.777  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.96 on 430 degrees of freedom
## Multiple R-squared:  0.5769, Adjusted R-squared:  0.5681 
## F-statistic: 65.15 on 9 and 430 DF,  p-value: < 2.2e-16
# compare nested (small vs large) models
# Ho: smaller model is defensible
anova(fit_3, fit_no_inte)
## Analysis of Variance Table
## 
## Model 1: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + 
##     region + pop_density
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + 
##     docs_density + region + pop_density
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    430 138704                           
## 2    429 138602  1    102.01 0.3157 0.5745

Fail to reject H0, large model is not superior, we should keep the smaller one.

no beds_density

fit_4 = lm(crm_1000 ~ pop18 + totalinc + poverty + hsgrad + region + pop_density + docs_density, data = crime_df)
summary(fit_4)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + poverty + hsgrad + 
##     region + pop_density + docs_density, data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.950 -11.572  -0.876  10.949  79.338 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.410e+01  1.607e+01  -0.877 0.380758    
## pop18         2.357e-01  2.327e-01   1.013 0.311786    
## totalinc      2.153e-04  7.333e-05   2.936 0.003508 ** 
## poverty       2.010e+00  3.031e-01   6.631 1.00e-10 ***
## hsgrad        2.801e-01  2.058e-01   1.361 0.174175    
## region2       1.074e+01  2.635e+00   4.078 5.42e-05 ***
## region3       2.566e+01  2.539e+00  10.106  < 2e-16 ***
## region4       1.784e+01  3.044e+00   5.862 9.10e-09 ***
## pop_density   5.065e-03  4.498e-04  11.260  < 2e-16 ***
## docs_density  2.162e+03  6.392e+02   3.382 0.000785 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.29 on 430 degrees of freedom
## Multiple R-squared:  0.5612, Adjusted R-squared:  0.552 
## F-statistic:  61.1 on 9 and 430 DF,  p-value: < 2.2e-16
# compare nested (small vs large) models
# Ho: smaller model is defensible
anova(fit_4, fit_no_inte)
## Analysis of Variance Table
## 
## Model 1: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + region + pop_density + 
##     docs_density
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + 
##     docs_density + region + pop_density
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    430 143860                                  
## 2    429 138602  1    5258.4 16.276 6.483e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Reject H0, large model is superior

no hsgrad

fit_5 = lm(crm_1000 ~ pop18 + totalinc + poverty + region + pop_density + docs_density + beds_density_1000, data = crime_df)
summary(fit_5)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + poverty + region + 
##     pop_density + docs_density + beds_density_1000, data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.333 -11.471  -0.716  10.299  73.846 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.209e+00  6.470e+00  -0.187 0.851804    
## pop18              5.475e-01  2.190e-01   2.501 0.012769 *  
## totalinc           2.403e-04  7.250e-05   3.314 0.000998 ***
## poverty            1.262e+00  2.292e-01   5.505 6.37e-08 ***
## region2            1.001e+01  2.553e+00   3.922 0.000102 ***
## region3            2.631e+01  2.484e+00  10.591  < 2e-16 ***
## region4            2.227e+01  2.922e+00   7.619 1.64e-13 ***
## pop_density        5.083e-03  4.415e-04  11.511  < 2e-16 ***
## docs_density      -2.024e+02  8.942e+02  -0.226 0.821077    
## beds_density_1000  2.858e+00  7.244e-01   3.945 9.31e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.01 on 430 degrees of freedom
## Multiple R-squared:  0.5747, Adjusted R-squared:  0.5658 
## F-statistic: 64.56 on 9 and 430 DF,  p-value: < 2.2e-16
# compare nested (small vs large) models
# Ho: smaller model is defensible
anova(fit_5, fit_no_inte)
## Analysis of Variance Table
## 
## Model 1: crm_1000 ~ pop18 + totalinc + poverty + region + pop_density + 
##     docs_density + beds_density_1000
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + 
##     docs_density + region + pop_density
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    430 139433                           
## 2    429 138602  1    831.22 2.5728 0.1095

Fail to reject H0, large model is not superior, we should keep the smaller one.

no poverty

fit_6 = lm(crm_1000 ~ pop18 + totalinc + region + pop_density + docs_density + beds_density_1000 + hsgrad, data = crime_df)
summary(fit_6)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + region + pop_density + 
##     docs_density + beds_density_1000 + hsgrad, data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.197 -11.670  -0.842  10.300  76.758 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.777e+01  1.265e+01   2.195 0.028675 *  
## pop18              7.721e-01  2.290e-01   3.372 0.000814 ***
## totalinc           2.260e-04  7.447e-05   3.035 0.002551 ** 
## region2            1.154e+01  2.652e+00   4.349 1.71e-05 ***
## region3            2.973e+01  2.447e+00  12.147  < 2e-16 ***
## region4            2.749e+01  2.854e+00   9.631  < 2e-16 ***
## pop_density        5.292e-03  4.543e-04  11.650  < 2e-16 ***
## docs_density      -9.572e+02  9.355e+02  -1.023 0.306767    
## beds_density_1000  4.099e+00  7.064e-01   5.802 1.27e-08 ***
## hsgrad            -3.849e-01  1.518e-01  -2.536 0.011564 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.49 on 430 degrees of freedom
## Multiple R-squared:  0.5514, Adjusted R-squared:  0.5421 
## F-statistic: 58.74 on 9 and 430 DF,  p-value: < 2.2e-16
# compare nested (small vs large) models
# Ho: smaller model is defensible
anova(fit_6, fit_no_inte)
## Analysis of Variance Table
## 
## Model 1: crm_1000 ~ pop18 + totalinc + region + pop_density + docs_density + 
##     beds_density_1000 + hsgrad
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + 
##     docs_density + region + pop_density
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    430 147059                                  
## 2    429 138602  1    8456.7 26.175 4.712e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Reject H0, large model is superior

no totalinc

fit_7 = lm(crm_1000 ~ pop18 + poverty + hsgrad + beds_density_1000 + docs_density + region + pop_density, data = crime_df)
summary(fit_7)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + poverty + hsgrad + beds_density_1000 + 
##     docs_density + region + pop_density, data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -53.252 -11.245  -0.797   9.991  75.301 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -2.002e+01  1.615e+01  -1.240 0.215822    
## pop18              4.302e-01  2.359e-01   1.824 0.068902 .  
## poverty            1.545e+00  3.174e-01   4.868 1.58e-06 ***
## hsgrad             2.829e-01  2.046e-01   1.382 0.167557    
## beds_density_1000  2.651e+00  7.286e-01   3.638 0.000308 ***
## docs_density      -2.649e+00  9.121e+02  -0.003 0.997684    
## region2            9.215e+00  2.657e+00   3.469 0.000576 ***
## region3            2.576e+01  2.526e+00  10.198  < 2e-16 ***
## region4            2.189e+01  3.092e+00   7.077 6.01e-12 ***
## pop_density        5.555e-03  4.311e-04  12.884  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.2 on 430 degrees of freedom
## Multiple R-squared:  0.5658, Adjusted R-squared:  0.5567 
## F-statistic: 62.25 on 9 and 430 DF,  p-value: < 2.2e-16
# compare nested (small vs large) models
# Ho: smaller model is defensible
anova(fit_7, fit_no_inte)
## Analysis of Variance Table
## 
## Model 1: crm_1000 ~ pop18 + poverty + hsgrad + beds_density_1000 + docs_density + 
##     region + pop_density
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + 
##     docs_density + region + pop_density
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    430 142362                                  
## 2    429 138602  1    3759.8 11.637 0.0007077 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Reject H0, large model is superior.

no pop18

fit_8 = lm(crm_1000 ~ poverty + hsgrad + beds_density_1000 + docs_density + region + pop_density + totalinc, data = crime_df)
summary(fit_7)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + poverty + hsgrad + beds_density_1000 + 
##     docs_density + region + pop_density, data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -53.252 -11.245  -0.797   9.991  75.301 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -2.002e+01  1.615e+01  -1.240 0.215822    
## pop18              4.302e-01  2.359e-01   1.824 0.068902 .  
## poverty            1.545e+00  3.174e-01   4.868 1.58e-06 ***
## hsgrad             2.829e-01  2.046e-01   1.382 0.167557    
## beds_density_1000  2.651e+00  7.286e-01   3.638 0.000308 ***
## docs_density      -2.649e+00  9.121e+02  -0.003 0.997684    
## region2            9.215e+00  2.657e+00   3.469 0.000576 ***
## region3            2.576e+01  2.526e+00  10.198  < 2e-16 ***
## region4            2.189e+01  3.092e+00   7.077 6.01e-12 ***
## pop_density        5.555e-03  4.311e-04  12.884  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.2 on 430 degrees of freedom
## Multiple R-squared:  0.5658, Adjusted R-squared:  0.5567 
## F-statistic: 62.25 on 9 and 430 DF,  p-value: < 2.2e-16
# compare nested (small vs large) models
# Ho: smaller model is defensible
anova(fit_8, fit_no_inte)
## Analysis of Variance Table
## 
## Model 1: crm_1000 ~ poverty + hsgrad + beds_density_1000 + docs_density + 
##     region + pop_density + totalinc
## Model 2: crm_1000 ~ pop18 + totalinc + poverty + hsgrad + beds_density_1000 + 
##     docs_density + region + pop_density
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)  
## 1    430 139639                             
## 2    429 138602  1      1037 3.2099 0.0739 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Fail to reject H0, large model is not superior, we should keep the smaller one.

Interations

Model 1

fiti <- lm(crm_1000 ~ pop18 +totalinc + poverty + beds_density_1000 + region + pop_density + pop18*pop_density, data = crime_df)
summary(fiti)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + poverty + beds_density_1000 + 
##     region + pop_density + pop18 * pop_density, data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.315 -11.345  -0.571  10.215  73.941 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.201e+01  6.699e+00  -1.792  0.07383 .  
## pop18              8.962e-01  2.213e-01   4.050 6.08e-05 ***
## totalinc           2.018e-04  7.064e-05   2.856  0.00449 ** 
## poverty            1.208e+00  2.119e-01   5.700 2.23e-08 ***
## beds_density_1000  2.982e+00  4.866e-01   6.129 2.01e-09 ***
## region2            1.001e+01  2.482e+00   4.032 6.55e-05 ***
## region3            2.725e+01  2.443e+00  11.155  < 2e-16 ***
## region4            2.350e+01  2.841e+00   8.273 1.65e-15 ***
## pop_density        1.984e-02  3.469e-03   5.721 1.99e-08 ***
## pop18:pop_density -4.872e-04  1.135e-04  -4.293 2.18e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.63 on 430 degrees of freedom
## Multiple R-squared:  0.5921, Adjusted R-squared:  0.5836 
## F-statistic: 69.36 on 9 and 430 DF,  p-value: < 2.2e-16
interact_plot(fiti, pred = pop18, modx = pop_density)
## Warning: -1306.28434985964 is outside the observed range of pop_density

Backward Model

fiti <- lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome + 
    totalinc + region + pop_density + beds_density_1000 + bagrad*pcincome + bagrad*pop_density, data = crime_df)
summary(fiti)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + bagrad + poverty + pcincome + 
##     totalinc + region + pop_density + beds_density_1000 + bagrad * 
##     pcincome + bagrad * pop_density, data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.147 -10.177  -0.915   9.618  60.267 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -8.990e+01  1.647e+01  -5.457 8.23e-08 ***
## pop18               7.242e-01  2.751e-01   2.633 0.008778 ** 
## bagrad              1.867e+00  5.021e-01   3.718 0.000228 ***
## poverty             2.181e+00  2.906e-01   7.503 3.65e-13 ***
## pcincome            4.350e-03  7.999e-04   5.439 9.06e-08 ***
## totalinc            8.373e-05  7.336e-05   1.141 0.254364    
## region2             1.216e+01  2.420e+00   5.024 7.46e-07 ***
## region3             2.912e+01  2.418e+00  12.044  < 2e-16 ***
## region4             2.447e+01  2.834e+00   8.634  < 2e-16 ***
## pop_density         9.032e-03  1.107e-03   8.161 3.76e-15 ***
## beds_density_1000   1.940e+00  4.973e-01   3.901 0.000111 ***
## bagrad:pcincome    -1.006e-04  2.348e-05  -4.283 2.28e-05 ***
## bagrad:pop_density -1.960e-04  4.872e-05  -4.023 6.80e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.91 on 427 degrees of freedom
## Multiple R-squared:  0.6276, Adjusted R-squared:  0.6171 
## F-statistic: 59.96 on 12 and 427 DF,  p-value: < 2.2e-16
interact_plot(fiti, pred = bagrad, modx = pcincome)

interact_plot(fiti, pred = bagrad, modx = pop_density)
## Warning: -1306.28434985964 is outside the observed range of pop_density

Candidate models

Thus the four candidate regression model should be:

model_1 = lm(crm_1000 ~ pop18 +totalinc + poverty + beds_density_1000 + region + pop_density + pop18*pop_density, data = crime_df)

model_2 = lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000, data = crime_df)

model_3 = lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000 + bagrad*pcincome + bagrad*pop_density, data = crime_df)

Model Diagnostics

Boxcox Transformation

Model1

bc <- boxcox(model_1, lambda = seq(-5, 5, by = 0.25))

lambda <- bc$x[which.max(bc$y)]
#choose power 1/2

crime_df$crm.bc <- crime_df$crm_1000^(1/2)
model_bc1 = lm(crm.bc ~ pop18 +totalinc + poverty + beds_density_1000 + region + pop_density + pop18*pop_density, data = crime_df)

par(mfrow = c(1,2))
plot(model_1, which = 2)
plot(model_bc1, which = 2)

summary(model_1)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + totalinc + poverty + beds_density_1000 + 
##     region + pop_density + pop18 * pop_density, data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.315 -11.345  -0.571  10.215  73.941 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.201e+01  6.699e+00  -1.792  0.07383 .  
## pop18              8.962e-01  2.213e-01   4.050 6.08e-05 ***
## totalinc           2.018e-04  7.064e-05   2.856  0.00449 ** 
## poverty            1.208e+00  2.119e-01   5.700 2.23e-08 ***
## beds_density_1000  2.982e+00  4.866e-01   6.129 2.01e-09 ***
## region2            1.001e+01  2.482e+00   4.032 6.55e-05 ***
## region3            2.725e+01  2.443e+00  11.155  < 2e-16 ***
## region4            2.350e+01  2.841e+00   8.273 1.65e-15 ***
## pop_density        1.984e-02  3.469e-03   5.721 1.99e-08 ***
## pop18:pop_density -4.872e-04  1.135e-04  -4.293 2.18e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.63 on 430 degrees of freedom
## Multiple R-squared:  0.5921, Adjusted R-squared:  0.5836 
## F-statistic: 69.36 on 9 and 430 DF,  p-value: < 2.2e-16
summary(model_bc1)
## 
## Call:
## lm(formula = crm.bc ~ pop18 + totalinc + poverty + beds_density_1000 + 
##     region + pop_density + pop18 * pop_density, data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1476 -0.6532  0.0468  0.7784  4.4936 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.944e+00  4.482e-01   6.570 1.46e-10 ***
## pop18              5.472e-02  1.480e-02   3.696 0.000247 ***
## totalinc           1.683e-05  4.726e-06   3.562 0.000410 ***
## poverty            7.121e-02  1.418e-02   5.022 7.49e-07 ***
## beds_density_1000  2.076e-01  3.255e-02   6.377 4.67e-10 ***
## region2            7.061e-01  1.660e-01   4.253 2.59e-05 ***
## region3            1.898e+00  1.634e-01  11.614  < 2e-16 ***
## region4            1.727e+00  1.900e-01   9.085  < 2e-16 ***
## pop_density        8.191e-04  2.320e-04   3.530 0.000460 ***
## pop18:pop_density -1.923e-05  7.592e-06  -2.534 0.011645 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.18 on 430 degrees of freedom
## Multiple R-squared:  0.5486, Adjusted R-squared:  0.5391 
## F-statistic: 58.06 on 9 and 430 DF,  p-value: < 2.2e-16

Model2

bc <- boxcox(model_2, lambda = seq(-5, 5, by = 0.25))

lambda <- bc$x[which.max(bc$y)]
#choose power 1/2

crime_df$crm.bc <- crime_df$crm_1000^(1/2)
model_bc2 = lm(crm.bc ~ pop18 + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000, data = crime_df)

par(mfrow = c(1,2))
plot(model_2, which = 2)
plot(model_bc2, which = 2)

summary(model_2)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + bagrad + poverty + pcincome + 
##     totalinc + region + pop_density + beds_density_1000, data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.532 -11.110  -0.727   9.967  79.427 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.959e+01  1.214e+01  -1.614  0.10728    
## pop18              7.508e-01  2.864e-01   2.622  0.00906 ** 
## bagrad            -2.184e-01  2.226e-01  -0.981  0.32709    
## poverty            1.569e+00  2.849e-01   5.507 6.30e-08 ***
## pcincome           8.141e-04  4.547e-04   1.790  0.07413 .  
## totalinc           1.886e-04  7.639e-05   2.469  0.01394 *  
## region2            1.085e+01  2.563e+00   4.234 2.81e-05 ***
## region3            2.704e+01  2.544e+00  10.629  < 2e-16 ***
## region4            2.308e+01  2.980e+00   7.743 7.05e-14 ***
## pop_density        4.843e-03  4.491e-04  10.784  < 2e-16 ***
## beds_density_1000  2.577e+00  5.046e-01   5.108 4.92e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.96 on 429 degrees of freedom
## Multiple R-squared:  0.578,  Adjusted R-squared:  0.5682 
## F-statistic: 58.76 on 10 and 429 DF,  p-value: < 2.2e-16
summary(model_bc2)
## 
## Call:
## lm(formula = crm.bc ~ pop18 + bagrad + poverty + pcincome + totalinc + 
##     region + pop_density + beds_density_1000, data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1813 -0.6894  0.0230  0.7638  4.0506 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.465e+00  7.963e-01   1.840  0.06653 .  
## pop18              6.222e-02  1.879e-02   3.311  0.00101 ** 
## bagrad            -2.174e-02  1.460e-02  -1.489  0.13735    
## poverty            1.043e-01  1.869e-02   5.579 4.30e-08 ***
## pcincome           8.347e-05  2.984e-05   2.797  0.00539 ** 
## totalinc           1.317e-05  5.013e-06   2.627  0.00893 ** 
## region2            7.888e-01  1.682e-01   4.690 3.68e-06 ***
## region3            1.935e+00  1.669e-01  11.589  < 2e-16 ***
## region4            1.766e+00  1.956e-01   9.029  < 2e-16 ***
## pop_density        2.128e-04  2.947e-05   7.222 2.35e-12 ***
## beds_density_1000  1.811e-01  3.311e-02   5.470 7.68e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.178 on 429 degrees of freedom
## Multiple R-squared:  0.5506, Adjusted R-squared:  0.5402 
## F-statistic: 52.57 on 10 and 429 DF,  p-value: < 2.2e-16

Model3

bc <- boxcox(model_3, lambda = seq(-5, 5, by = 0.25))

lambda <- bc$x[which.max(bc$y)]
#choose power 1/2

crime_df$crm.bc <- crime_df$crm_1000^(1/2)
model_bc3 = lm(crm.bc ~ pop18 + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000 + bagrad*pcincome + bagrad*pop_density, data = crime_df)

par(mfrow = c(1,2))
plot(model_3, which = 2)
plot(model_bc3, which = 2)

summary(model_3)
## 
## Call:
## lm(formula = crm_1000 ~ pop18 + bagrad + poverty + pcincome + 
##     totalinc + region + pop_density + beds_density_1000 + bagrad * 
##     pcincome + bagrad * pop_density, data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.147 -10.177  -0.915   9.618  60.267 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -8.990e+01  1.647e+01  -5.457 8.23e-08 ***
## pop18               7.242e-01  2.751e-01   2.633 0.008778 ** 
## bagrad              1.867e+00  5.021e-01   3.718 0.000228 ***
## poverty             2.181e+00  2.906e-01   7.503 3.65e-13 ***
## pcincome            4.350e-03  7.999e-04   5.439 9.06e-08 ***
## totalinc            8.373e-05  7.336e-05   1.141 0.254364    
## region2             1.216e+01  2.420e+00   5.024 7.46e-07 ***
## region3             2.912e+01  2.418e+00  12.044  < 2e-16 ***
## region4             2.447e+01  2.834e+00   8.634  < 2e-16 ***
## pop_density         9.032e-03  1.107e-03   8.161 3.76e-15 ***
## beds_density_1000   1.940e+00  4.973e-01   3.901 0.000111 ***
## bagrad:pcincome    -1.006e-04  2.348e-05  -4.283 2.28e-05 ***
## bagrad:pop_density -1.960e-04  4.872e-05  -4.023 6.80e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.91 on 427 degrees of freedom
## Multiple R-squared:  0.6276, Adjusted R-squared:  0.6171 
## F-statistic: 59.96 on 12 and 427 DF,  p-value: < 2.2e-16
summary(model_bc3)
## 
## Call:
## lm(formula = crm.bc ~ pop18 + bagrad + poverty + pcincome + totalinc + 
##     region + pop_density + beds_density_1000 + bagrad * pcincome + 
##     bagrad * pop_density, data = crime_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0257 -0.6925  0.0322  0.6745  3.5136 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -3.173e+00  1.098e+00  -2.889 0.004062 ** 
## pop18               5.539e-02  1.834e-02   3.021 0.002672 ** 
## bagrad              1.301e-01  3.347e-02   3.887 0.000118 ***
## poverty             1.459e-01  1.937e-02   7.533 2.99e-13 ***
## pcincome            3.282e-04  5.332e-05   6.156 1.72e-09 ***
## totalinc            6.965e-06  4.890e-06   1.424 0.155068    
## region2             8.582e-01  1.613e-01   5.320 1.68e-07 ***
## region3             2.033e+00  1.612e-01  12.612  < 2e-16 ***
## region4             1.809e+00  1.889e-01   9.573  < 2e-16 ***
## pop_density         3.613e-04  7.377e-05   4.898 1.38e-06 ***
## beds_density_1000   1.338e-01  3.315e-02   4.036 6.45e-05 ***
## bagrad:pcincome    -7.504e-06  1.565e-06  -4.794 2.26e-06 ***
## bagrad:pop_density -6.847e-06  3.248e-06  -2.108 0.035588 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.127 on 427 degrees of freedom
## Multiple R-squared:  0.5907, Adjusted R-squared:  0.5792 
## F-statistic: 51.36 on 12 and 427 DF,  p-value: < 2.2e-16

Residuals

plot(model_1, which = 1)

plot(model_2, which = 1)

plot(model_3, which = 1)

observation #6, #215

Normal QQ plots

Filter out influential observations

plot(model_1, which = 2)

plot(model_2, which = 2)

plot(model_3, which = 2)

The observation #6, #215, #123is an outlier in all three

Scale and locations

plot(model_1, which = 3)

plot(model_2, which = 3)

plot(model_3, which = 3)

The observation #123, #6, #215 is identified

Outliers and Leverage

plot(model_1, which = 5)

plot(model_2, which = 5)

plot(model_3, which = 5)

observation #1, #6

plot(model_1, which = 6)

plot(model_2, which = 6)

plot(model_3, which = 6)

#1, #123, #6

The observation #6 and #1, is beyond the Cook’s distance lines of 0.5 (> 1). The plot identified the influential observation as #6, #1

crime_df %>%
  filter(row(crime_df) == 6 | row(crime_df) == 215 )
## # A tibble: 2 × 16
##   crm_1000 cty   state pop18 pop65 hsgrad bagrad poverty unemp pcincome totalinc
##      <dbl> <chr> <chr> <dbl> <dbl>  <dbl>  <dbl>   <dbl> <dbl>    <dbl>    <dbl>
## 1     296. Kings NY     28.3  12.4   63.7   16.6    19.5   9.5    16803    38658
## 2     112. Atla… NJ     29    14.5   72.9   16.4     6.4   8.3    24035     5392
## # … with 5 more variables: region <fct>, pop_density <dbl>,
## #   beds_density_1000 <dbl>, docs_density <dbl>, crm.bc <dbl>
crime_after <- crime_df %>%
  filter(row(crime_df) != 6)

Colinearility

# Correlation matrix for all variables
temp <- crime_after %>%
  dplyr::select(-state, -cty, -region)

cor(temp)
##                      crm_1000       pop18       pop65      hsgrad      bagrad
## crm_1000           1.00000000  0.21111276 -0.07448047 -0.20671429  0.05501983
## pop18              0.21111276  1.00000000 -0.61630643  0.25141951  0.45619208
## pop65             -0.07448047 -0.61630643  1.00000000 -0.26919500 -0.33928575
## hsgrad            -0.20671429  0.25141951 -0.26919500  1.00000000  0.70858677
## bagrad             0.05501983  0.45619208 -0.33928575  0.70858677  1.00000000
## poverty            0.47132275  0.03452596  0.00631249 -0.68859004 -0.40799186
## unemp              0.01882935 -0.27883812  0.23656367 -0.59167432 -0.54041041
## pcincome          -0.07881273 -0.03171871  0.01865180  0.52349172  0.69520378
## totalinc           0.19993547  0.07198201 -0.02319956  0.05473613  0.22699684
## pop_density        0.29355413  0.17535045  0.03751855 -0.05416937  0.24037288
## beds_density_1000  0.39845317  0.02954235  0.24713716 -0.21157574 -0.04527816
## docs_density       0.33806678  0.23702815  0.01860966  0.14338706  0.44121009
## crm.bc             0.98705951  0.20599865 -0.07654989 -0.19181575  0.05951478
##                       poverty       unemp    pcincome     totalinc pop_density
## crm_1000           0.47132275  0.01882935 -0.07881273  0.199935465  0.29355413
## pop18              0.03452596 -0.27883812 -0.03171871  0.071982012  0.17535045
## pop65              0.00631249  0.23656367  0.01865180 -0.023199555  0.03751855
## hsgrad            -0.68859004 -0.59167432  0.52349172  0.054736134 -0.05416937
## bagrad            -0.40799186 -0.54041041  0.69520378  0.226996843  0.24037288
## poverty            1.00000000  0.43380542 -0.60326536 -0.052025537  0.07001134
## unemp              0.43380542  1.00000000 -0.32155149 -0.040991522 -0.02478102
## pcincome          -0.60326536 -0.32155149  1.00000000  0.352424960  0.34018837
## totalinc          -0.05202554 -0.04099152  0.35242496  1.000000000  0.32911874
## pop_density        0.07001134 -0.02478102  0.34018837  0.329118735  1.00000000
## beds_density_1000  0.37306692 -0.06293606 -0.05344502  0.005714199  0.27840087
## docs_density       0.06413325 -0.24830541  0.36011639  0.200450879  0.43747995
## crm.bc             0.45355488  0.01411476 -0.07597748  0.198626376  0.27465972
##                   beds_density_1000 docs_density      crm.bc
## crm_1000                0.398453170   0.33806678  0.98705951
## pop18                   0.029542349   0.23702815  0.20599865
## pop65                   0.247137160   0.01860966 -0.07654989
## hsgrad                 -0.211575741   0.14338706 -0.19181575
## bagrad                 -0.045278162   0.44121009  0.05951478
## poverty                 0.373066922   0.06413325  0.45355488
## unemp                  -0.062936062  -0.24830541  0.01411476
## pcincome               -0.053445023   0.36011639 -0.07597748
## totalinc                0.005714199   0.20045088  0.19862638
## pop_density             0.278400865   0.43747995  0.27465972
## beds_density_1000       1.000000000   0.66670719  0.36861627
## docs_density            0.666707186   1.00000000  0.31512464
## crm.bc                  0.368616267   0.31512464  1.00000000
crime_after %>% 
  dplyr::select(-crm_1000) %>% 
  ggcorr(label = TRUE, label_size = 2, hjust = 0.8)
## Warning in ggcorr(., label = TRUE, label_size = 2, hjust = 0.8): data in
## column(s) 'cty', 'state', 'region' are not numeric and were ignored

The correlation plot suggest high correlation between beds and docs, beds and totalinc, docs and totalinc.

Let’s check if the model violates colienarity

Calculate VIF w/o interaction terms

model_1_new = lm(crm_1000 ~ pop18 +totalinc + poverty + beds_density_1000 + region + pop_density, data = crime_after)

model_2_new = lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome + 
    totalinc + region + pop_density + beds_density_1000, data = crime_after)

model_3_new = lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome + 
    totalinc + region + pop_density + beds_density_1000, data = crime_after)

check_collinearity(model_1_new)
## # Check for Multicollinearity
## 
## Low Correlation
## 
##               Term  VIF Increased SE Tolerance
##              pop18 1.04         1.02      0.96
##           totalinc 1.00         1.00      1.00
##            poverty 1.36         1.17      0.74
##  beds_density_1000 1.36         1.17      0.73
##             region 1.31         1.14      0.77
##        pop_density 1.11         1.05      0.90
check_collinearity(model_2_new)
## # Check for Multicollinearity
## 
## Low Correlation
## 
##               Term  VIF Increased SE Tolerance
##              pop18 1.97         1.40      0.51
##             bagrad 2.12         1.46      0.47
##            poverty 1.35         1.16      0.74
##           pcincome 1.10         1.05      0.91
##           totalinc 1.00         1.00      1.00
##             region 1.34         1.16      0.74
##        pop_density 1.05         1.03      0.95
##  beds_density_1000 1.32         1.15      0.76
check_collinearity(model_3_new)
## # Check for Multicollinearity
## 
## Low Correlation
## 
##               Term  VIF Increased SE Tolerance
##              pop18 1.97         1.40      0.51
##             bagrad 2.12         1.46      0.47
##            poverty 1.35         1.16      0.74
##           pcincome 1.10         1.05      0.91
##           totalinc 1.00         1.00      1.00
##             region 1.34         1.16      0.74
##        pop_density 1.05         1.03      0.95
##  beds_density_1000 1.32         1.15      0.76

There is no multicolinearity in our models

Final models

model_1 = lm(crm_1000 ~ pop18 +totalinc + poverty + beds_density_1000 + region + pop_density + pop18*pop_density, data = crime_after)

model_2 = lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome + 
    totalinc + region + pop_density + beds_density_1000, data = crime_after)

model_3 = lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome + 
    totalinc + region + pop_density + beds_density_1000 + bagrad*pcincome + bagrad*pop_density, data = crime_after)

model_1 %>% broom::tidy()
## # A tibble: 10 × 5
##    term               estimate std.error statistic  p.value
##    <chr>                 <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)       -8.06     6.62          -1.22 2.24e- 1
##  2 pop18              0.781    0.218          3.58 3.90e- 4
##  3 totalinc           0.000282 0.0000715      3.94 9.64e- 5
##  4 poverty            1.14     0.208          5.47 7.65e- 8
##  5 beds_density_1000  3.40     0.486          7.00 1.01e-11
##  6 region2            9.62     2.43           3.96 8.92e- 5
##  7 region3           26.7      2.40          11.1  1.77e-25
##  8 region4           22.7      2.79           8.16 3.88e-15
##  9 pop_density        0.00763  0.00439        1.74 8.30e- 2
## 10 pop18:pop_density -0.000157 0.000134      -1.17 2.44e- 1
model_2 %>% broom::tidy()
## # A tibble: 11 × 5
##    term                estimate  std.error statistic  p.value
##    <chr>                  <dbl>      <dbl>     <dbl>    <dbl>
##  1 (Intercept)       -43.8      12.0           -3.64 3.03e- 4
##  2 pop18               1.12      0.277          4.06 5.81e- 5
##  3 bagrad             -0.406     0.213         -1.91 5.71e- 2
##  4 poverty             1.74      0.271          6.41 3.85e-10
##  5 pcincome            0.00165   0.000447       3.69 2.55e- 4
##  6 totalinc            0.000211  0.0000725      2.91 3.80e- 3
##  7 region2            11.1       2.43           4.56 6.55e- 6
##  8 region3            27.8       2.41          11.5  6.09e-27
##  9 region4            24.1       2.83           8.52 2.77e-16
## 10 pop_density         0.00164   0.000625       2.62 9.09e- 3
## 11 beds_density_1000   3.19      0.486          6.55 1.61e-10
model_3 %>% broom::tidy()
## # A tibble: 13 × 5
##    term                   estimate  std.error statistic  p.value
##    <chr>                     <dbl>      <dbl>     <dbl>    <dbl>
##  1 (Intercept)        -110.        16.4          -6.73  5.65e-11
##  2 pop18                 0.919      0.269         3.42  6.86e- 4
##  3 bagrad                2.05       0.487         4.21  3.07e- 5
##  4 poverty               2.36       0.283         8.32  1.21e-15
##  5 pcincome              0.00535    0.000796      6.72  5.87e-11
##  6 totalinc              0.000126   0.0000715     1.76  7.96e- 2
##  7 region2              11.9        2.34          5.07  5.89e- 7
##  8 region3              28.7        2.34         12.3   8.86e-30
##  9 region4              24.0        2.75          8.75  5.17e-17
## 10 pop_density           0.00255    0.00161       1.59  1.13e- 1
## 11 beds_density_1000     2.35       0.488         4.83  1.94e- 6
## 12 bagrad:pcincome      -0.000123   0.0000231    -5.31  1.77e- 7
## 13 bagrad:pop_density   -0.0000245  0.0000568    -0.431 6.67e- 1

Cross validation

set.seed(7)

cv_df =
  crossv_mc(crime_df, 100) %>% 
  mutate(
    train = map(train, as_tibble),
    test = map(test, as_tibble)
  ) %>% 
  mutate(
    model_1 = map(train, ~lm(crm_1000 ~ pop18 +totalinc + poverty + beds_density_1000 + region + pop_density + pop18*pop_density, data = .x)),
    model_2 = map(train, ~lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000, data = .x)),
    model_3 = map(train, ~lm(crm_1000 ~ pop18 + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000 + bagrad*pcincome + bagrad*pop_density, data = .x))
  ) %>% 
  mutate(
    rmse_1 = map2_dbl(model_1, test, ~rmse(model = .x, data = .y)),
    rmse_2 = map2_dbl(model_2, test, ~rmse(model = .x, data = .y)),
    rmse_3 = map2_dbl(model_3, test, ~rmse(model = .x, data = .y))
  )
  
cv_df
## # A tibble: 100 × 9
##    train               test   .id   model_1 model_2 model_3 rmse_1 rmse_2 rmse_3
##    <list>              <list> <chr> <list>  <list>  <list>   <dbl>  <dbl>  <dbl>
##  1 <tibble [351 × 16]> <tibb… 001   <lm>    <lm>    <lm>      15.4   15.9   15.5
##  2 <tibble [351 × 16]> <tibb… 002   <lm>    <lm>    <lm>      17.9   18.4   18.3
##  3 <tibble [351 × 16]> <tibb… 003   <lm>    <lm>    <lm>      19.3   23.8   21.0
##  4 <tibble [351 × 16]> <tibb… 004   <lm>    <lm>    <lm>      20.7   20.4   20.0
##  5 <tibble [351 × 16]> <tibb… 005   <lm>    <lm>    <lm>      20.4   24.0   22.8
##  6 <tibble [351 × 16]> <tibb… 006   <lm>    <lm>    <lm>      18.4   17.9   17.5
##  7 <tibble [351 × 16]> <tibb… 007   <lm>    <lm>    <lm>      21.8   23.4   20.6
##  8 <tibble [351 × 16]> <tibb… 008   <lm>    <lm>    <lm>      18.3   18.5   17.1
##  9 <tibble [351 × 16]> <tibb… 009   <lm>    <lm>    <lm>      16.4   16.5   15.3
## 10 <tibble [351 × 16]> <tibb… 010   <lm>    <lm>    <lm>      20.4   25.7   22.9
## # … with 90 more rows
cv_df %>% 
  dplyr::select(rmse_1:rmse_3) %>% 
  pivot_longer(
    rmse_1:rmse_3,
    names_to = "model",
    values_to = "rmse",
    names_prefix = "rmse_"
  ) %>% 
  ggplot(aes(x = model, y = rmse)) +
  geom_boxplot() +
  labs(
    x = "Model",
    y = "RMSE"
  )

set.seed(2021)

# Use 5-fold validation and create the training sets
train = trainControl(method = "repeatedcv", number = 5, repeats = 100)

# Fit the 4-variables model that we discussed in previous lectures
model_caret_1 = 
  train(
    crm_1000 ~ pop18 +totalinc + poverty + beds_density_1000 + region + pop_density + pop18*pop_density,
    data = crime_df,
    trControl = train,
    method = 'lm',
    na.action = na.pass
  )

model_caret_2 = 
  train(
    crm_1000 ~ pop18 + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000,
    data = crime_df,
    trControl = train,
    method = 'lm',
    na.action = na.pass
  )

model_caret_3 = 
  train(
    crm_1000 ~ pop18 + bagrad + poverty + pcincome + totalinc + region + pop_density + beds_density_1000 + bagrad*pcincome + bagrad*pop_density,
    data = crime_df,
    trControl = train,
    method = 'lm',
    na.action = na.pass
  )

bind_rows(
  as_tibble(model_caret_1$results),
  as_tibble(model_caret_2$results),
  as_tibble(model_caret_3$results),
) %>% 
  knitr::kable(digit = 3)
intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
TRUE 18.922 0.518 13.999 2.415 0.099 1.137
TRUE 19.747 0.489 14.155 3.382 0.070 1.176
TRUE 18.520 0.545 13.290 2.925 0.073 1.119